# -*- coding: utf-8 -*-
"""
Created on Mon Jan 3 16:28:27 2022
@author: 玦祎
"""
import re
import pandas as pd
import openpyxl
import time
import requests
xlsx = '半导体行业.xlsx'
df = pd.read_excel(xlsx)
exf = openpyxl.load_workbook(xlsx)  # workbook handle
sheet = exf.active  # only the currently active sheet is processed
C2 = sheet['C2']
C = sheet['C']  # column C holds the '"href","file name"' annotations
links = [cell.value for cell in C]
links_1 = links[1:-1]  # drop the header cell and the trailing cell
links_2 = ''.join(links_1)
# Each record is a quoted pair: "link","file name".
p = re.compile('"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)
df2 = pd.DataFrame({
    'link': [pair[0] for pair in list_of_tuple],
    'f_name': [pair[1] for pair in list_of_tuple],
})
df2.to_csv('半导体行业.csv')
import re
import pandas as pd
import openpyxl
import os
# Read the link table back in.  Passing the path (with encoding) lets
# pandas open and close the file itself — the original opened a bare
# file handle that was never closed.
df = pd.read_csv('半导体行业.csv', encoding='utf-8')
def filter_links(words, df, include=True):
    """Filter rows of df by whether f_name contains the given words.

    include=True  -> keep rows whose f_name contains ANY of the words.
    include=False -> keep rows whose f_name contains NONE of the words.
    Returns the filtered DataFrame (a boolean-mask selection of df).
    """
    if include:
        mask = [any(word in name for word in words) for name in df.f_name]
    else:
        mask = [all(word not in name for word in words) for name in df.f_name]
    return df[mask]
# Drop abstracts, inquiry letters, CSR / audit / financial / risk / bond files.
df_all = filter_links(['摘要', '问询函', '社会责任', '审计', '财务', '风险', '债券'], df, include=False)
# Original announcements: titles without parentheses (full- or half-width).
df_orig = filter_links(['(', '('], df_all, include=False)
# Updated announcements: titles with parentheses, excluding cancellations.
df_updt = filter_links(['(', '('], df_all, include=True)
df_updt = filter_links(['取消'], df_updt, include=False)
def sub_with_update(df_updt, df_orig):
    """Return a copy of df_orig with links replaced by their updated versions.

    An updated announcement's f_name contains the original f_name as a
    substring; for every such match the second-to-last column (the link)
    of the copy is overwritten with the updated row's link, while the
    original file name is kept.
    """
    df_newest = df_orig.copy()
    # Positional (row-in-orig, row-in-updt) pairs, in the original scan order.
    pairs = [
        (i, j)
        for i, orig_name in enumerate(df_orig.f_name)
        for j, updt_name in enumerate(df_updt.f_name)
        if orig_name in updt_name
    ]
    for i, j in pairs:
        # Second-to-last column is the link; f_name stays the original one.
        df_newest.iloc[i, -2] = df_updt.iloc[j, -2]
    return df_newest
df_newest = sub_with_update(df_updt, df_orig)
df_newest.sort_values(by=['f_name'], inplace=True, ignore_index=True)
# The first four characters of each file name are the company short name.
df_newest['公司简称'] = [name[:4] for name in df_newest.f_name]
counts = df_newest['公司简称'].value_counts()
# One sub-frame per company, for the ten companies with the most reports.
ten_company = [filter_links([cn], df_newest) for cn in counts.index[:10]]
if not os.path.exists('10companies'):
    os.makedirs('10companies')
for df_com in ten_company:
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)
ten_csv = os.listdir('10companies')
#将不同公司的年报链接分别储存在不同的csv文件
#运行选出的公司有两家公司通过代码无法获得年报链接的csv,手动删除
# 通过提取下载后csv文件里的链接,下载各家公司各个年份的年报pdf文件
import re
import os
import requests
import pandas as pd
import time
# Loop over the ten per-company csv files and build PDF download links.
import warnings  # surfaces missing-link pages; the original's bare Warning() was a no-op


def get_PDF_url(url):
    """Fetch an announcement page and return (href, fname) of its PDF link.

    Returns (None, None) when no download link is found.  The original
    returned an empty tuple on failure, which crashed the caller's
    two-value unpacking.
    """
    r = requests.get(url)
    r.encoding = 'utf-8'
    html = r.text
    r.close()  # html captured; the connection is no longer needed
    # NOTE(review): the original pattern was '(.*?)' with ONE group, yet the
    # code read group(1) and group(2) -- the <a ...> markup was almost
    # certainly lost when the source was copied around.  Reconstructed as an
    # anchor-tag pattern; confirm against the actual page HTML.
    p = re.compile('<a href="(.*?)".*?>(.*?)</a>', re.DOTALL)
    a = p.search(html)  # the first anchor on the page is the target link
    if a is None:
        warnings.warn('没有找到下载链接。请手动检查链接:%s' % url)
        return (None, None)
    href = r.url[:26] + a.group(1)  # prepend scheme+host to complete the URL
    fname = a.group(2).strip()
    return (href, fname)


for info in os.listdir('10companies'):
    domain = os.path.abspath(r'10companies')  # folder path
    info = os.path.join(domain, info)  # full path of this company's csv
    df = pd.read_csv(info)
    links = df["link"]
    f_names = df["f_name"]
    hrefs = []
    fnames = []
    for link in links:
        href, fname = get_PDF_url(link)
        hrefs.append(href)
        fnames.append(fname)
        time.sleep(0)  # placeholder throttle between requests
    df_final_links = pd.DataFrame({'href': hrefs, 'f_name': fnames})
    ste = info[-8:-4]  # company short name embedded in the csv file name
    df_final_links.to_csv("final_links_" + ste + ".csv")
import os
import requests
import pandas as pd
import time
# Download every annual-report PDF using the final_links_*.csv files
# generated above, one pass per company csv.
for info in os.listdir('10companies'):
    domain = os.path.abspath(r'10companies')  # folder path
    info = os.path.join(domain, info)  # full path of this company's csv
    # NOTE: the original also did pd.read_csv(info) here, but the result
    # was never used -- the wasted read is removed.
    ste = info[-8:-4]  # company short name embedded in the csv file name
    df_final_links = pd.read_csv("final_links_" + ste + ".csv")
    hrefs = df_final_links["href"]
    f_names = df_final_links["f_name"]
    for href, f_name in zip(hrefs, f_names):
        r = requests.get(href, allow_redirects=True)
        # Context manager guarantees the file handle is closed; the
        # original's bare open(...).write(...) leaked it.
        with open('%s' % f_name, 'wb') as fh:
            fh.write(r.content)
        time.sleep(0)  # placeholder throttle between downloads
        r.close()
import os
import re
import fitz # pip install pymupdf
import csv
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK glyphs in plots
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
filenames = os.listdir()
prefix = '年度报告'
pdf = [name for name in filenames if name.endswith('.pdf')]
# Slice [-13:-4] takes the 9 characters before '.pdf' -- presumably the
# year tag of the report title; confirm against actual file names.
year = [name[-13:-4] for name in pdf]
gs = [name[:4] for name in pdf]  # company short name (first 4 characters)
def getText(pdf):
    """Return the concatenated text of every page of the given PDF file.

    The document is closed even if text extraction raises -- the original
    leaked the handle on any page.getText() failure.
    """
    doc = fitz.open(pdf)
    try:
        # page.getText() is the legacy PyMuPDF API (get_text() in newer
        # releases); kept as-is to match the installed version.
        pages = [page.getText() for page in doc]
    finally:
        doc.close()
    return ''.join(pages)
# Full text of every annual-report PDF, in listing order.
text = [getText(name) for name in pdf]
text[0]  # notebook-style peek at the first document's text (no effect as a script)
def get_content(pdf):
    """Return the second section ('公司简介和主要财务指标') of an annual report.

    Raises ValueError naming the file when the section markers are not
    found, instead of the original's opaque AttributeError on None.
    """
    text = getText(pdf)
    p = re.compile(r'第二节\s*公司简介和主要财务指标(.*)第三节\s*公司业务概要', re.DOTALL)
    m = p.search(text)
    if m is None:
        raise ValueError('未找到目标章节: %s' % pdf)
    # NOTE(review): group(0) keeps both section headers in the result; if
    # only the body between them is wanted, group(1) is the capture -- confirm.
    return m.group(0)
# The target section extracted from every report, in listing order.
content = [get_content(name) for name in pdf]